Loading the data

## New names:
## Rows: 2607 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Country Name, continent dbl (18): ...1, Year, Agriculture, value added (%
## of GDP), CO2 emissions (me...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Preview the first rows of the loaded tibble.
data_frame |>
  head()
## # A tibble: 6 × 20
##    ...1 `Country Name`  Year Agriculture, value added (…¹ CO2 emissions (metri…²
##   <dbl> <chr>          <dbl>                        <dbl>                  <dbl>
## 1     0 Afghanistan     1962                           NA                 0.0738
## 2     1 Afghanistan     1967                           NA                 0.124 
## 3     2 Afghanistan     1972                           NA                 0.131 
## 4     3 Afghanistan     1977                           NA                 0.183 
## 5     4 Afghanistan     1982                           NA                 0.166 
## 6     5 Afghanistan     1987                           NA                 0.276 
## # ℹ abbreviated names: ¹​`Agriculture, value added (% of GDP)`,
## #   ²​`CO2 emissions (metric tons per capita)`
## # ℹ 15 more variables:
## #   `Domestic credit provided by financial sector (% of GDP)` <dbl>,
## #   `Electric power consumption (kWh per capita)` <dbl>,
## #   `Energy use (kg of oil equivalent per capita)` <dbl>,
## #   `Exports of goods and services (% of GDP)` <dbl>, …

Data Visualization


# Keep only the observations recorded for the year 1962.
filtered_data <- filter(data_frame, Year == 1962)
  

# Scatter plot of CO2 emissions against GDP per capita for the rows in
# `filtered_data`; rows missing either value are dropped before plotting.
filtered_data |>
  select(co2 = starts_with('CO2'), gdpPercap) |>
  filter(!is.na(co2), !is.na(gdpPercap)) |>
  ggplot(aes(x = gdpPercap, y = co2)) +
  geom_point() +
  labs(
    title = 'CO2 emissions per capita generally increases with GDP',
    x = 'GDP per capita',
    y = 'CO2 emissions (metric tons per capita)'
  )

correlation between CO2 emission and GDP growth

# CO2 emissions and GDP per capita taken from `filtered_data`, restricted to
# rows where both values are present.
correlation_columns <- filtered_data |>
  select(co2 = starts_with('CO2'), gdpPercap) |>
  filter(!is.na(co2), !is.na(gdpPercap))

# Both variables are continuous and quantitative, so Pearson's correlation is
# used. Pearson is already the default method of cor(); it is spelled out here
# for clarity.
# The function returns 0.9260817, indicating a positive relationship between
# the two variables (they evolve in the same direction).

with(
  correlation_columns,
  cor(co2, gdpPercap, method = 'pearson')
)
## [1] 0.9260817

# The p-value is the probability of obtaining a correlation coefficient at
# least as extreme as r when sampling from a population in which the null
# hypothesis (true correlation = 0) holds.

# The arguments are written as `correlation_columns$...` on purpose: the
# "data:" line of the printed result is built from these expressions.
cor.test(
  x = correlation_columns$co2,
  y = correlation_columns$gdpPercap,
  method = 'pearson'
)
## 
##  Pearson's product-moment correlation
## 
## data:  correlation_columns$co2 and correlation_columns$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8934697 0.9489792
## sample estimates:
##       cor 
## 0.9260817

# Correlation between CO2 emissions and GDP per capita computed separately
# for every year, drawn as one labelled bar per year.
data_frame |>
  select(Year, co2 = starts_with('CO2'), gdpPercap) |>
  # Drop rows where either measurement is missing.
  filter(!(is.na(co2) | is.na(gdpPercap))) |>
  group_by(Year) |>
  summarise(r = cor(co2, gdpPercap), .groups = 'drop') |>
  ggplot(aes(x = Year, y = r)) +
  geom_col() +
  # Label each bar with its coefficient rounded to two decimals.
  geom_text(aes(label = round(r, 2)), vjust = -0.5) +
  # One axis break per year present in the full data set.
  scale_x_continuous(breaks = sort(unique(data_frame$Year))) +
  theme(legend.position = 'none')

library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Bubble chart for 1967: CO2 emissions against GDP per capita, with point size
# mapped to population and colour mapped to continent.
plot <- data_frame |>
  filter(Year == 1967) |>
  select(co2 = starts_with('CO2'), gdpPercap, pop, continent) |>
  ggplot() +
  geom_point(aes(x = gdpPercap, y = co2, size = pop, color = continent)) +
  labs(
    x = 'GDP per capita',
    y = 'CO2 emissions (metric tons per capita)'
  ) +
  # Axis labels are shown in thousands of dollars with a "K" suffix.
  scale_x_continuous(labels = label_dollar(scale = 1 / 1000, suffix = 'K')) +
  # Restrict the visible window on both axes.
  coord_cartesian(xlim = c(0, 20000), ylim = c(0, 25))

# Convert the static ggplot into an interactive plotly figure.
plot |>
  ggplotly()

stat test

##What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’? (stats test needed)

### Planned steps:
### - box plot of energy use across continents
### - run a linear regression model
### - run ANOVA

# Linear model of energy use per capita explained by continent.
fit <- lm(
  formula = `Energy use (kg of oil equivalent per capita)` ~ continent,
  data = data_frame
)
# ANOVA table for the fitted model.
anova(fit)
## Analysis of Variance Table
## 
## Response: Energy use (kg of oil equivalent per capita)
##            Df     Sum Sq   Mean Sq F value    Pr(>F)    
## continent   4  771482483 192870621  51.459 < 2.2e-16 ***
## Residuals 843 3159591816   3748033                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

I used ANOVA to test whether there are significant differences in energy use across continents, under the assumption that the average energy use is equal for all continents.

Null Hypothesis H0: there is no difference in mean energy use across continents

Alternative Hypothesis H1: the mean energy use of at least one continent differs from the others

the threshold for p-value is set to 0.05

The last column of the table, Pr(>F), displays the p-value, which indicates the probability—under the null hypothesis—of observing an F-value at least as extreme as the one obtained. The p-value is below 2.2 × 10⁻¹⁶, far below the 0.05 threshold, thus we reject the null hypothesis.

if we want to compare individual groups, we can use a contrast matrix.

##Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990? (stats test needed)

Since we are comparing two independent groups, we will use a t-test for the comparison of group means.

Null Hypothesis: There is no significant difference in imports as a percentage of GDP between Asia and Europe. Alternative Hypothesis: The mean import percentage of GDP differs between Asia and Europe.


# Imports of goods and services (% of GDP) for Asian and European countries in
# the years after 1990, keeping only rows with a recorded import value.
euro_asia <- data_frame |>
  filter(Year > 1990, continent %in% c('Asia', 'Europe')) |>
  select(continent, import_goods = starts_with('Import')) |>
  # Only the measured column needs an NA check: `continent` cannot be missing
  # after the continent filter above.
  filter(!is.na(import_goods))


# Two-sample t-test of import share by continent (t.test() defaults to the
# Welch unequal-variance form); only the p-value is stored.
t_test <- t.test(import_goods ~ continent, data = euro_asia)[['p.value']]

Since p > 0.05, we fail to reject the null hypothesis: there is no evidence of a significant difference between Asia and Europe in imports of goods and services as a percentage of GDP.

##What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)